library(tidyverse)
library(janitor)
library(GGally)
# Load the raw housing data from CSV and print it for a first look.
kc_house <- read_csv(file = "data/kc_house_data.csv")
kc_house

# Drop the columns that are excluded from modelling.
house_clean <- select(
  kc_house,
  -id, -date,
  -sqft_living, -sqft_lot, -sqft_living15, -sqft_lot15,
  -zipcode, -lat, -long
)
house_clean

# Count the missing values in every remaining column.
summarise(house_clean, across(.cols = everything(), .fns = ~ sum(is.na(.x))))
Recode grade from its numeric 1-13 scale into 5 labelled categories
# Collapse the numeric grade into five labelled bands.
# Every band is spelled out explicitly (no catch-all branch) so that a
# missing or out-of-range grade stays NA instead of being silently
# labelled "high quality".
house_clean <- house_clean %>%
  mutate(grade = case_when(
    grade > 0 & grade <= 3 ~ "falls short",
    grade > 3 & grade <= 6 ~ "below average",
    grade == 7 ~ "average",
    grade > 7 & grade <= 10 ~ "above average",
    grade > 10 & grade <= 13 ~ "high quality"
  ))
Convert waterfront to a logical variable (FALSE when 0, TRUE otherwise)
# Recode waterfront as a logical flag: 0 becomes FALSE, any other value TRUE
# (missing values stay NA).
house_clean <- mutate(house_clean, waterfront = waterfront != 0)
house_clean
Replace yr_renovated with a logical variable, renovated (FALSE when yr_renovated is 0, TRUE otherwise)
# Add a logical renovated flag (TRUE when yr_renovated is non-zero) and
# remove the original yr_renovated column in the same step.
house_clean <- mutate(
  house_clean,
  renovated = yr_renovated != 0,
  yr_renovated = NULL
)
house_clean
# Question 2
# Fit price against every remaining column of house_clean.
model <- lm(formula = price ~ ., data = house_clean)

# Check the fitted model for aliased (linearly dependent) terms.
alias(model)
## Model :
## price ~ bedrooms + bathrooms + floors + waterfront + view + condition +
## grade + sqft_above + sqft_basement + yr_built + renovated

# Coefficient table and fit statistics for the full model.
summary(model)
##
## Call:
## lm(formula = price ~ ., data = house_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1614080 -112764 -9605 90843 3987682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.214e+06 1.467e+05 42.358 < 2e-16 ***
## bedrooms -3.980e+04 2.085e+03 -19.091 < 2e-16 ***
## bathrooms 4.821e+04 3.588e+03 13.436 < 2e-16 ***
## floors 4.034e+04 3.851e+03 10.474 < 2e-16 ***
## waterfrontTRUE 5.400e+05 1.910e+04 28.279 < 2e-16 ***
## view 5.048e+04 2.302e+03 21.926 < 2e-16 ***
## condition 2.084e+04 2.562e+03 8.134 4.39e-16 ***
## gradeaverage -1.028e+05 4.057e+03 -25.338 < 2e-16 ***
## gradebelow average -1.615e+05 6.566e+03 -24.601 < 2e-16 ***
## gradefalls short -1.033e+05 1.110e+05 -0.931 0.35203
## gradehigh quality 4.869e+05 1.152e+04 42.280 < 2e-16 ***
## sqft_above 2.088e+02 3.249e+00 64.273 < 2e-16 ***
## sqft_basement 1.991e+02 4.638e+00 42.934 < 2e-16 ***
## yr_built -3.132e+03 7.365e+01 -42.522 < 2e-16 ***
## renovatedTRUE 2.808e+04 8.016e+03 3.503 0.00046 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 221500 on 21598 degrees of freedom
## Multiple R-squared: 0.6363, Adjusted R-squared: 0.636
## F-statistic: 2699 on 14 and 21598 DF, p-value: < 2.2e-16
# Question 3
# Split the columns by type and draw one pairs plot per group.
# select(where(...)) replaces the superseded select_if().
houses_tidy_numeric <- house_clean %>%
  select(where(is.numeric))

# Keep price alongside the non-numeric columns so that it appears in the
# second pairs plot as well; it is placed last, after the non-numeric columns.
houses_tidy_nonnumeric <- house_clean %>%
  select(!where(is.numeric), price)

ggpairs(houses_tidy_numeric)
ggpairs(houses_tidy_nonnumeric)
Now we will build four single-predictor regression models, regressing price on each of the four main effects (renovated, grade, bedrooms and floors) in turn
# Single-predictor model: price explained by renovated alone.
mod1a <- lm(formula = price ~ renovated, data = house_clean)
summary(mod1a)
##
## Call:
## lm(formula = price ~ renovated, data = house_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -650379 -215361 -85361 104639 6939621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 530361 2532 209.51 <2e-16 ***
## renovatedTRUE 230018 12310 18.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 364200 on 21611 degrees of freedom
## Multiple R-squared: 0.0159, Adjusted R-squared: 0.01585
## F-statistic: 349.2 on 1 and 21611 DF, p-value: < 2.2e-16
# Single-predictor model: price explained by grade alone.
mod2a <- lm(formula = price ~ grade, data = house_clean)
summary(mod2a)
##
## Call:
## lm(formula = price ~ grade, data = house_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1258635 -147590 -43590 96410 6021365
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 665392 2912 228.466 < 2e-16 ***
## gradeaverage -262802 4214 -62.370 < 2e-16 ***
## gradebelow average -370168 6674 -55.462 < 2e-16 ***
## gradefalls short -475642 144313 -3.296 0.000983 ***
## gradehigh quality 1013243 13205 76.734 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 288600 on 21608 degrees of freedom
## Multiple R-squared: 0.3823, Adjusted R-squared: 0.3822
## F-statistic: 3343 on 4 and 21608 DF, p-value: < 2.2e-16
# Single-predictor model: price explained by bedrooms alone.
mod3a <- lm(formula = price ~ bedrooms, data = house_clean)
summary(mod3a)
##
## Call:
## lm(formula = price ~ bedrooms, data = house_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3506435 -203235 -66667 105049 6839901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129802 8932 14.53 <2e-16 ***
## bedrooms 121716 2554 47.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 349200 on 21611 degrees of freedom
## Multiple R-squared: 0.09508, Adjusted R-squared: 0.09504
## F-statistic: 2271 on 1 and 21611 DF, p-value: < 2.2e-16
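Bedrooms alone explains only about 9.5% of the variance in price (R-squared = 0.095, not 95%); that is more than renovated (about 1.6%) but far less than grade (about 38%), which has the largest effect of the models fitted so far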
# Single-predictor model: price explained by floors alone.
mod4a <- lm(formula = price ~ floors, data = house_clean)
summary(mod4a)
##
## Call:
## lm(formula = price ~ floors, data = house_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -597965 -203837 -73787 103213 6984329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 279198 7102 39.31 <2e-16 ***
## floors 174589 4470 39.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 354800 on 21611 degrees of freedom
## Multiple R-squared: 0.06594, Adjusted R-squared: 0.0659
## F-statistic: 1526 on 1 and 21611 DF, p-value: < 2.2e-16